# Load libraries
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(gridExtra)
library(scales)
library(Cairo)
library(grid)
library(vcd)
library(countrycode)
# Load data and add labels
# TODO: Use non-unicode-mangled data
load(file.path(PROJHOME, "data_raw", "responses_orgs.RData"))
load(file.path(PROJHOME, "data_raw", "responses_countries.RData"))
responses.orgs.labs <- read_csv(file.path(PROJHOME, "data_raw", "response_orgs_labels.csv"))
responses.countries.labs <- read_csv(file.path(PROJHOME, "data_raw", "response_countries_labels.csv"))
Hmisc::label(responses.orgs, self=FALSE) <- responses.orgs.labs$varlabel
Hmisc::label(responses.countries, self=FALSE) <- responses.countries.labs$varlabel
# Add a nicer short ID to each response to reference in the article
set.seed(1234)
nicer.ids <- data_frame(survey.id = responses.orgs$survey.id) %>%
mutate(clean.id = 1000 + sample(1:n()))
write_csv(nicer.ids, path=file.path(PROJHOME, "data", "id_lookup_WILL_BE_OVERWRITTEN.csv"))
The data is split into two sets: responses.orgs has a row for each surveyed organization and responses.countries has a row for each country organizations responded for (1-4 countries per organization). For ease of analysis, this combines them into one larger dataframe (so organization-level data is repeated). It also removes columns that were added manually, where an RA coded whether a country was mentioned in different questions (with a colum for each country!).
responses.all <- responses.orgs %>%
left_join(responses.countries, by="survey.id") %>%
select(-contains("_c", ignore.case=FALSE)) %>% # Get rid of all the dummy vars
left_join(nicer.ids, by="survey.id") %>%
select(survey.id, clean.id, everything())
Convert some responses into numeric indexes:
importance <- data_frame(Q3.19 = levels(responses.countries$Q3.19),
importance = c(2, 1, 0, NA))
importance.levels <- data_frame(start=c(0, 1, 2),
end=c(1, 2, 3),
level=c("Not important", "Somewhat important",
"Most important"),
level.ordered=factor(level, levels=level, ordered=TRUE))
positivity <- data_frame(Q3.25 = levels(responses.countries$Q3.25),
positivity = c(-1, 1, 0, NA))
improvement <- data_frame(Q3.26 = levels(responses.countries$Q3.26),
improvement = c(1, 0, -1, NA))
# Cho data
tip.change <- read_csv(file.path(PROJHOME, "data", "policy_index.csv")) %>%
group_by(countryname) %>%
summarise(avg_tier = mean(tier, na.rm=TRUE),
change_tip = -(last(na.omit(tier), default=NA) -
first(na.omit(tier), default=NA)),
change_policy = last(na.omit(p), default=NA) -
first(na.omit(p), default=NA)) %>%
mutate(cow = countrycode(countryname, "country.name", "cown"))
# Funding
funding.raw <- read_csv(file.path(PROJHOME, "data_raw", "funding_clean.csv")) %>%
mutate(cowcode = ifelse(country == "Serbia", 555, cowcode),
countryname = countrycode(cowcode, "cown", "country.name"),
countryname = ifelse(cowcode == 555, "Serbia", countryname)) %>%
filter(!is.na(countryname))
funding.all <- funding.raw %>%
group_by(countryname) %>%
summarise(total.funding = sum(amount, na.rm=TRUE),
avg.funding = mean(amount, na.rm=TRUE))
funding.ngos <- funding.raw %>%
filter(recipient_type %in% c("NGO", "NPO")) %>%
group_by(countryname) %>%
summarise(total.funding.ngos = sum(amount, na.rm=TRUE),
avg.funding.ngos = mean(amount, na.rm=TRUE))
responses.full <- responses.all %>%
left_join(tip.change, by=c("work.country" = "countryname")) %>%
left_join(funding.all, by=c("work.country" = "countryname")) %>%
left_join(funding.ngos, by=c("work.country" = "countryname")) %>%
left_join(positivity, by = "Q3.25") %>%
left_join(importance, by = "Q3.19") %>%
left_join(improvement, by = "Q3.26") %>%
mutate(received.funding = ifelse(Q3.18_3 != 1 | is.na(Q3.18_3), FALSE, TRUE),
us.involvement = ifelse(Q3.18_5 != 1 | is.na(Q3.18_5), TRUE, FALSE),
Q3.19 = factor(Q3.19, levels=c("Most important actor",
"Somewhat important actor",
"Not an important actor",
"Don't know"),
ordered=TRUE),
Q3.25_collapsed = ifelse(Q3.25 == "Negative", NA, Q3.25))
country.indexes <- responses.full %>%
filter(!is.na(work.country)) %>%
group_by(work.country) %>%
# Needs mutate + mutate_each + select + unique because you can't mix
# summarise + summarise_each. See http://stackoverflow.com/a/31815540/120898
mutate(num.responses = n()) %>%
mutate_each(funs(score = mean(., na.rm=TRUE), stdev = sd(., na.rm=TRUE),
n = sum(!is.na(.))),
c(positivity, importance, improvement)) %>%
select(work.country, num.responses, matches("positivity_|importance_|improvement_")) %>%
unique %>%
ungroup() %>%
arrange(desc(num.responses))
# Useful functions
theme_clean <- function(base_size=9, base_family="Source Sans Pro Light") {
ret <- theme_bw(base_size, base_family) +
theme(panel.background = element_rect(fill="#ffffff", colour=NA),
axis.title.x=element_text(vjust=-0.2), axis.title.y=element_text(vjust=1.5),
title=element_text(vjust=1.2, family="Source Sans Pro Semibold"),
panel.border = element_blank(), axis.line=element_blank(),
panel.grid.minor=element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.major.x = element_line(size=0.25, colour="grey90"),
axis.ticks=element_blank(),
legend.position="bottom",
axis.title=element_text(size=rel(0.8), family="Source Sans Pro Semibold"),
strip.text=element_text(size=rel(0.9), family="Source Sans Pro Semibold"),
strip.background=element_rect(fill="#ffffff", colour=NA),
panel.margin=unit(1, "lines"), legend.key.size=unit(.7, "line"),
legend.key=element_blank())
ret
}
# Return a data frame of counts and proportions for multiple responses
separate.answers.summary <- function(df, cols, labels, total=FALSE) {
cols.to.select <- which(colnames(df) %in% cols)
denominator <- df %>%
select(cols.to.select) %>%
mutate(num.answered = rowSums(., na.rm=TRUE)) %>%
filter(num.answered > 0) %>%
nrow()
df <- df %>%
select(survey.id, cols.to.select) %>%
gather(question, value, -survey.id) %>%
mutate(question = factor(question, labels=labels, ordered=TRUE)) %>%
group_by(question) %>%
summarize(response = sum(value, na.rm=TRUE),
pct = round(response / denominator * 100, 2),
plot.pct = response / denominator)
colnames(df) <- c("Answer", "Responses", "%", "plot.pct")
if (total) {
df <- df %>% select(1:3)
df <- rbind(as.matrix(df), c("Total responses", denominator, "—"))
}
return(list(df=df, denominator=denominator))
}
# Create a character vector of significance stars
add.stars <- function(x) {
as.character(symnum(x, corr = FALSE,
cutpoints = c(0, .001,.01,.05, .1, 1),
symbols = c("***","**","*","."," ")))
}
# Select just the columns that have cowcodes embedded in them
active.embassies.raw <- responses.countries %>%
select(contains("_c", ignore.case=FALSE)) %>%
mutate_each(funs(as.numeric(levels(.))[.])) # Convert values to numeric
# Select only the rows where they responded (i.e. not all columns are NA)
num.responses <- active.embassies.raw %>%
rowwise() %>% do(all.missing = all(!is.na(.))) %>%
ungroup() %>% mutate(all.missing = unlist(all.missing)) %>%
summarise(total = sum(all.missing))
# Tidy cowcode columns and summarize most commonly mentioned countries
active.embassies <- active.embassies.raw %>%
gather(country.raw, num) %>%
group_by(country.raw) %>% summarise(num = sum(num, na.rm=TRUE)) %>%
mutate(country.raw = str_replace(country.raw, "Q.*c", ""),
country = countrycode(country.raw, "cown", "country.name"),
country = ifelse(country.raw == "2070", "European Union", country)) %>%
ungroup() %>% mutate(prop = num / num.responses$total,
prop.nice = sprintf("%.1f%%", prop * 100))
Which embassies or foreign governments NGOs were reported as active partners in the fight against human trafficking?
active.embassies.top <- active.embassies %>%
arrange(num) %>% select(-country.raw) %>%
filter(num > 10) %>%
mutate(country = factor(country, levels=country, ordered=TRUE)) %>%
arrange(desc(num))
active.embassies %>% arrange(desc(num)) %>%
select(-country.raw) %>% filter(num > 10)
## Source: local data frame [12 x 4]
##
## num country prop prop.nice
## (dbl) (chr) (dbl) (chr)
## 1 260 United States 0.77380952 77.4%
## 2 43 United Kingdom 0.12797619 12.8%
## 3 35 Netherlands 0.10416667 10.4%
## 4 30 France 0.08928571 8.9%
## 5 26 Norway 0.07738095 7.7%
## 6 26 European Union 0.07738095 7.7%
## 7 24 Switzerland 0.07142857 7.1%
## 8 21 Sweden 0.06250000 6.2%
## 9 19 Australia 0.05654762 5.7%
## 10 17 Germany 0.05059524 5.1%
## 11 17 Italy 0.05059524 5.1%
## 12 12 Canada 0.03571429 3.6%
nrow(active.embassies) # Number of countries mentioned
## [1] 64
num.responses$total # Total responses
## [1] 336
# Most active embassies
# Save Q3.7 to a CSV for hand coding
most.active <- responses.countries %>%
select(Q3.7) %>%
filter(!is.na(Q3.7))
write_csv(most.active, path=file.path(PROJHOME, "data",
"most_active_WILL_BE_OVERWRITTEN.csv"))
# Read in hand-coded CSV
if (file.exists(file.path(PROJHOME, "data", "most_active.csv"))) {
most.active <- read_csv(file.path(PROJHOME, "data", "most_active.csv"))
} else {
stop("data/most_active.csv is missing")
}
# Split comma-separated countries, unnest them into multiple rows, and
# summarize most active countries
most.active.clean <- most.active %>%
transform(clean = strsplit(clean, ",")) %>%
unnest(clean) %>%
mutate(clean = str_trim(clean)) %>%
group_by(clean) %>%
summarise(total = n()) %>%
mutate(prop = total / nrow(most.active),
prop.nice = sprintf("%.1f%%", prop * 100))
Which countries or embassies have been the most active?
most.active.clean %>% arrange(desc(total))
## Source: local data frame [39 x 4]
##
## clean total prop prop.nice
## (chr) (int) (dbl) (chr)
## 1 United States 187 0.70300752 70.3%
## 2 None 16 0.06015038 6.0%
## 3 European Union 14 0.05263158 5.3%
## 4 All 12 0.04511278 4.5%
## 5 Australia 7 0.02631579 2.6%
## 6 Italy 7 0.02631579 2.6%
## 7 Switzerland 7 0.02631579 2.6%
## 8 United Kingdom 7 0.02631579 2.6%
## 9 Norway 6 0.02255639 2.3%
## 10 Philippines 6 0.02255639 2.3%
## .. ... ... ... ...
nrow(most.active.clean) - 1 # Subtract one because of "None"s
## [1] 38
Over the last 10–15 years, has the United States or its embassy been active in the fight against human trafficking in X?
responses.countries$Q3.8 %>% table %>% print %>% prop.table
## .
## No Yes Don't know
## 39 343 149
## .
## No Yes Don't know
## 0.07344633 0.64595104 0.28060264
Side-by-side graph of active countries + most active countries
plot.data <- active.embassies.top %>%
bind_rows(data_frame(num=0, country=c("All", "None"),
prop=0, prop.nice="")) %>%
arrange(num) %>%
mutate(country = factor(country, levels=country, ordered=TRUE))
plot.data.active <- most.active.clean %>%
filter(clean %in% plot.data$country) %>%
mutate(country = factor(clean, levels=levels(plot.data$country), ordered=TRUE))
fig.active <- ggplot(plot.data, aes(x=country, y=num)) +
geom_bar(stat="identity") +
geom_text(aes(label = prop.nice), size=3.5, hjust=1.3,
family="Source Sans Pro Light") +
labs(x=NULL, y="Number of times country was mentioned as a partner in anti-TIP work") +
scale_y_continuous(breaks=seq(0, max(active.embassies$num), by=25),
trans="reverse", expand = c(.1, .1)) +
coord_flip() +
theme_clean() +
theme(axis.text.y = element_blank(),
axis.line.y = element_blank(),
plot.margin = unit(c(1,0.5,1,1), "lines"))
fig.most.active <- ggplot(plot.data.active, aes(x=country, y=total)) +
geom_bar(stat="identity") +
geom_text(aes(label = prop.nice), size=3.5, hjust=-0.3,
family="Source Sans Pro Light") +
labs(x=NULL, y="Number of times country was mentioned as the most active partner in anti-TIP work") +
scale_y_continuous(expand = c(.15, .15)) +
coord_flip() +
theme_clean() +
theme(axis.text.y = element_text(hjust=0.5),
axis.line.y = element_blank(),
plot.margin = unit(c(1,1,1,0), "lines"))
fig.embassies <- arrangeGrob(fig.active, fig.most.active, nrow=1)
grid.draw(fig.embassies)
ggsave(fig.embassies, filename=file.path(PROJHOME, "figures", "fig_embassies.pdf"),
width=5, height=2, units="in", device=cairo_pdf, scale=2.5)
ggsave(fig.embassies, filename=file.path(PROJHOME, "figures", "fig_embassies.png"),
width=5, height=2, units="in", scale=2.5)
Actual US activities
cols <- c("Q3.9_1", "Q3.9_2", "Q3.9_3", "Q3.9_4", "Q3.9_5",
"Q3.9_6", "Q3.9_7", "Q3.9_8", "Q3.9_9", "Q3.9_10")
labels <- c("Asking for legislation", "Convening conferences or workshops",
"Raising awareness", "Providing resources or funding",
"Increasing government attention", "Training government officials",
"Contributing to a government action plan", "Other", "Don't know",
"The US has not been involved in trafficking issues")
activities <- separate.answers.summary(responses.countries, cols, labels)
activities$denominator # Number of responses
## [1] 530
activities$df
## Source: local data frame [10 x 4]
##
## Answer Responses % plot.pct
## (fctr) (int) (dbl) (dbl)
## 1 Asking for legislation 164 30.94 0.30943396
## 2 Convening conferences or workshops 207 39.06 0.39056604
## 3 Raising awareness 213 40.19 0.40188679
## 4 Providing resources or funding 210 39.62 0.39622642
## 5 Increasing government attention 216 40.75 0.40754717
## 6 Training government officials 146 27.55 0.27547170
## 7 Contributing to a government action plan 113 21.32 0.21320755
## 8 Other 43 8.11 0.08113208
## 9 Don't know 26 4.91 0.04905660
## 10 The US has not been involved in trafficking issues 166 31.32 0.31320755
plot.data <- activities$df %>%
mutate(Answer=factor(Answer, levels=rev(labels), ordered=TRUE))
fig.activities <- ggplot(plot.data, aes(x=Answer, y=Responses)) +
geom_bar(aes(y=plot.pct), stat="identity") +
labs(x=NULL, y=NULL) +
scale_y_continuous(labels = percent,
breaks = seq(0, max(round(plot.data$plot.pct, 1)), by=0.1)) +
coord_flip() + theme_clean()
fig.activities
ggsave(fig.activities, filename=file.path(PROJHOME, "figures", "fig_activities.pdf"),
width=6.5, height=5, units="in", device=cairo_pdf)
ggsave(fig.activities, filename=file.path(PROJHOME, "figures", "fig_activities.png"),
width=6.5, height=5, units="in")
plot.data <- responses.full %>%
group_by(Q3.19) %>%
summarize(num = n()) %>%
na.omit() %>%
mutate(prop = num / sum(num),
prop.nice = sprintf("%.1f%%", prop * 100),
Q3.19 = factor(Q3.19, levels=rev(levels(Q3.19)), ordered=TRUE))
plot.data
## Source: local data frame [4 x 4]
##
## Q3.19 num prop prop.nice
## (fctr) (int) (dbl) (chr)
## 1 Most important actor 139 0.2673077 26.7%
## 2 Somewhat important actor 180 0.3461538 34.6%
## 3 Not an important actor 68 0.1307692 13.1%
## 4 Don't know 133 0.2557692 25.6%
fig.us_importance <- ggplot(plot.data, aes(x=Q3.19, y=prop)) +
geom_bar(stat="identity") +
labs(x=NULL, y=NULL) +
scale_y_continuous(labels = percent,
breaks = seq(0, max(round(plot.data$num, 1)), by=0.1)) +
coord_flip() + theme_clean()
fig.us_importance
ggsave(fig.us_importance, filename=file.path(PROJHOME, "figures", "fig_us_importance.pdf"),
width=6.5, height=5, units="in", device=cairo_pdf)
ggsave(fig.us_importance, filename=file.path(PROJHOME, "figures", "fig_us_importance.png"),
width=6.5, height=5, units="in")
Average importance by country
importance.plot <- country.indexes %>%
filter(num.responses >= 10) %>%
arrange(importance_score) %>%
mutate(country_label = factor(work.country, levels=unique(work.country),
labels=paste0(work.country, " (", num.responses, ")"),
ordered=TRUE))
fig.avg_importance <- ggplot(importance.plot, aes(x=country_label, y=importance_score)) +
geom_rect(data=importance.levels, aes(x=NULL, y=NULL, ymin=start, ymax=end,
xmin=0, xmax=Inf, fill=level.ordered), alpha=0.5) +
geom_pointrange(aes(ymax=importance_score + importance_stdev,
ymin=importance_score - importance_stdev)) +
labs(x="Country (number of responses)",
y="Importance of the US in anti-TIP efforts (mean)") +
scale_fill_manual(values=c("grey90", "grey60", "grey30"), name=NULL) +
coord_flip() +
theme_clean() + theme(legend.position="bottom")
fig.avg_importance
ggsave(fig.avg_importance, filename=file.path(PROJHOME, "figures", "fig_avg_importance.pdf"),
width=6.5, height=5, units="in", device=cairo_pdf)
ggsave(fig.avg_importance, filename=file.path(PROJHOME, "figures", "fig_avg_importance.png"),
width=6.5, height=5, units="in")
df.importance <- responses.full %>%
select(Q3.19, change_policy, avg_tier, change_tip, change_policy,
importance, received.funding, us.involvement, total.funding) %>%
filter(!is.na(Q3.19)) %>%
mutate(importance_factor = factor(Q3.19, ordered=FALSE),
log.total.funding = log1p(total.funding))
Average tier doesn’t show much because it doesn’t show any changes in time—just how bad the country is in general?
# http://www.r-bloggers.com/analysis-of-variance-anova-for-multiple-comparisons/
importance.means <- df.importance %>%
group_by(Q3.19) %>%
summarize(avg_points = mean(avg_tier, na.rm=TRUE),
var_points = var(avg_tier, na.rm=TRUE)) %>%
print
## Source: local data frame [4 x 3]
##
## Q3.19 avg_points var_points
## (fctr) (dbl) (dbl)
## 1 Most important actor 2.054501 0.1036754
## 2 Somewhat important actor 1.927769 0.2153692
## 3 Not an important actor 1.768449 0.2980403
## 4 Don't know 1.787980 0.2715889
Plot group means and distributions
fig.importance <- ggplot(df.importance, aes(x=Q3.19, y=avg_tier)) +
geom_violin(fill="grey90") +
geom_point(alpha=0.05, show.legend=FALSE) +
geom_point(data=importance.means, aes(x=Q3.19, y=avg_points), size=5, show.legend=FALSE) +
labs(x="Opinion of US importance", y="Average TIP tier rating") +
coord_flip() + theme_clean()
fig.importance
Those means appear slightly different from each other. Is that really the case? Check with ANOVA, which assumes homogenous variance across groups. Throw every possible test at it—if null is rejected (p < 0.05 or whatever) then variance is likely heterogenous:
bartlett.test(avg_tier ~ importance_factor, data=df.importance)
##
## Bartlett test of homogeneity of variances
##
## data: avg_tier by importance_factor
## Bartlett's K-squared = 32.694, df = 3, p-value = 3.737e-07
car::leveneTest(avg_tier ~ importance_factor, data=df.importance)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 3 10.224 1.586e-06 ***
## 454
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fligner.test(avg_tier ~ importance_factor, data=df.importance) # Uses median
##
## Fligner-Killeen test of homogeneity of variances
##
## data: avg_tier by importance_factor
## Fligner-Killeen:med chi-squared = 24.984, df = 3, p-value = 1.556e-05
kruskal.test(avg_tier ~ importance_factor, data=df.importance) # Nonparametric
##
## Kruskal-Wallis rank sum test
##
## data: avg_tier by importance_factor
## Kruskal-Wallis chi-squared = 16.197, df = 3, p-value = 0.001033
All of those p-values are tiny, so it’s clear that variance is not the same across groups. However, there’s a rule of thumb (super detailed example) that ANOVA is robust to heterogeneity of variance as long as the largest variance is less than four times the smallest variance.
Given that rule of thumb, the variance here isn’t that much of an issue
df.importance %>% group_by(importance_factor) %>%
summarise(variance = var(avg_tier, na.rm=TRUE)) %>%
do(data_frame(ratio = max(.$variance) / min(.$variance)))
## Source: local data frame [1 x 1]
##
## ratio
## (dbl)
## 1 2.874743
It would be cool to use Bayesian ANOVA to account for non-homogenous variances (see John Kruschke’s evangelizing), since it handles violations of ANOVA assumptions nicely. However, in his example, the ratio of min/max variance is huge, so it does lead to big differences in results:
#
# read_csv("http://www.indiana.edu/~kruschke/DoingBayesianDataAnalysis/Programs/NonhomogVarData.csv") %>%
# group_by(Group) %>%
# summarise(variance = var(Y)) %>%
# do(data_frame(ratio = max(.$variance) / min(.$variance)))
# # ratio = 64
#
With the variance issue handled, run the ANOVA:
importance.aov <- aov(avg_tier ~ importance_factor, data=df.importance)
summary(importance.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## importance_factor 3 5.41 1.803 8.712 1.25e-05 ***
## Residuals 454 93.96 0.207
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 62 observations deleted due to missingness
There is some significant difference between groups. Look at pairwise comparisons between all the groups to (kind of) decompose that finding:
(importance.pairs <- TukeyHSD(importance.aov, "importance_factor"))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = avg_tier ~ importance_factor, data = df.importance)
##
## $importance_factor
## diff lwr upr p adj
## Somewhat important actor-Most important actor -0.12673275 -0.2643637 0.010898224 0.0835093
## Not an important actor-Most important actor -0.28605296 -0.4776665 -0.094439366 0.0007784
## Don't know-Most important actor -0.26652163 -0.4194140 -0.113629236 0.0000522
## Not an important actor-Somewhat important actor -0.15932021 -0.3441336 0.025493209 0.1185914
## Don't know-Somewhat important actor -0.13978888 -0.2840675 0.004489721 0.0615006
## Don't know-Not an important actor 0.01953133 -0.1769115 0.215974193 0.9940858
Plot the differences:
importance.pairs.plot <- data.frame(importance.pairs$importance_factor) %>%
mutate(pair = row.names(.),
pair = factor(pair, levels=pair, ordered=TRUE),
stars = add.stars(p.adj))
fig.importance.pairs <- ggplot(importance.pairs.plot,
aes(x=pair, y=diff, ymax=upr, ymin=lwr)) +
geom_hline(yintercept=0) +
geom_text(aes(label=stars), nudge_x=0.25) +
geom_pointrange() +
theme_clean() + coord_flip()
fig.importance.pairs
Another way of checking group means in non-homogenous data is to use ordinal logistic regression. Here’s an ordered logit and corresponding predicted probabilities:
model.importance <- ordinal::clm(Q3.19 ~ avg_tier, data=df.importance,
link="logit", Hess=TRUE)
summary(model.importance)
## formula: Q3.19 ~ avg_tier
## data: df.importance
##
## link threshold nobs logLik AIC niter max.grad cond.H
## logit flexible 458 -590.92 1189.83 6(0) 7.09e-08 2.3e+02
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## avg_tier -0.8634 0.1813 -4.761 1.92e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Threshold coefficients:
## Estimate Std. Error z value
## Most important actor|Somewhat important actor -2.6277 0.3703 -7.095
## Somewhat important actor|Not an important actor -1.0134 0.3512 -2.886
## Not an important actor|Don't know -0.4322 0.3495 -1.236
## (62 observations deleted due to missingness)
# Predicted probabilities
newdata <- data_frame(avg_tier = seq(0, 3, 0.1))
pred.importance <- predict(model.importance, newdata, interval=TRUE)
# Create plot data
pred.plot.lower <- cbind(newdata, pred.importance$lwr) %>%
gather(importance, lwr, -c(1:ncol(newdata)))
pred.plot.upper <- cbind(newdata, pred.importance$upr) %>%
gather(importance, upr, -c(1:ncol(newdata)))
pred.plot.data <- cbind(newdata, pred.importance$fit) %>%
gather(importance, importance_prob, -c(1:ncol(newdata))) %>%
left_join(pred.plot.lower, by=c("avg_tier", "importance")) %>%
left_join(pred.plot.upper, by=c("avg_tier", "importance"))
importance.colors <- c("grey20", "grey40", "grey60", "grey80")
ggplot(pred.plot.data, aes(x=avg_tier, y=importance_prob)) +
geom_ribbon(aes(ymax=upr, ymin=lwr, fill=importance),
alpha=0.2) +
geom_line(aes(colour=importance), size=2) +
scale_y_continuous(labels=percent) +
labs(x="Average tier rating in country",
y="Predicted probability of assigning importance") +
# scale_fill_manual(values=importance.colors, name=NULL) +
# scale_colour_manual(values=importance.colors, name=NULL) +
theme_clean()
Opinions of importance are not related to changes in TIP score. The average change in TIP rating is the same for each possible answer of importance.
ggplot(df.importance, aes(x=Q3.19, y=change_tip)) +
geom_violin(fill="grey90") +
geom_point(stat="summary", fun.y="mean", size=5) +
labs(x="Opinion of US", y="Change in TIP tier rating") +
coord_flip() + theme_clean()
Variance is equal in all groups:
kruskal.test(change_tip ~ importance_factor, data=df.importance)
##
## Kruskal-Wallis rank sum test
##
## data: change_tip by importance_factor
## Kruskal-Wallis chi-squared = 0.023921, df = 3, p-value = 0.999
ANOVA shows no differences:
change.anova <- aov(change_tip ~ importance_factor, data=df.importance)
summary(change.anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## importance_factor 3 0.11 0.03717 0.12 0.948
## Residuals 454 140.07 0.30851
## 62 observations deleted due to missingness
TukeyHSD(change.anova)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = change_tip ~ importance_factor, data = df.importance)
##
## $importance_factor
## diff lwr upr p adj
## Somewhat important actor-Most important actor -0.035714286 -0.2037504 0.1323219 0.9470518
## Not an important actor-Most important actor -0.040094340 -0.2740389 0.1938502 0.9711389
## Don't know-Most important actor -0.019495413 -0.2061645 0.1671737 0.9931627
## Not an important actor-Somewhat important actor -0.004380054 -0.2300221 0.2212620 0.9999549
## Don't know-Somewhat important actor 0.016218873 -0.1599335 0.1923713 0.9952866
## Don't know-Not an important actor 0.020598927 -0.2192418 0.2604396 0.9961638
Opinions of importance vary slightly by changes in Cho policy scores. Respondents who indicated that the US was more important tended to work in countries with greater changes in TIP policy.
ggplot(df.importance, aes(x=Q3.19, y=change_policy)) +
geom_violin(fill="grey90") +
geom_point(stat="summary", fun.y="mean", size=5) +
labs(x="Opinion of US", y="Change in TIP policy index") +
coord_flip() + theme_clean()
Variance is equal in all groups:
kruskal.test(change_policy ~ importance_factor, data=df.importance)
##
## Kruskal-Wallis rank sum test
##
## data: change_policy by importance_factor
## Kruskal-Wallis chi-squared = 6.992, df = 3, p-value = 0.07215
ANOVA almost shows some differences (biggest difference between “not important” and “most important” at p = 0.057):
cho.change.anova <- aov(change_policy ~ importance_factor, data=df.importance)
summary(cho.change.anova) # (⌐○Ϟ○) ♥ \(•◡•)/
## Df Sum Sq Mean Sq F value Pr(>F)
## importance_factor 3 50.4 16.791 2.478 0.0607 .
## Residuals 454 3076.6 6.777
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 62 observations deleted due to missingness
TukeyHSD(cho.change.anova)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = change_policy ~ importance_factor, data = df.importance)
##
## $importance_factor
## diff lwr upr p adj
## Somewhat important actor-Most important actor -0.4921875 -1.2797322 0.29535721 0.3731027
## Not an important actor-Most important actor -1.0747347 -2.1711759 0.02170655 0.0570850
## Don't know-Most important actor -0.6286554 -1.5035282 0.24621744 0.2501130
## Not an important actor-Somewhat important actor -0.5825472 -1.6400768 0.47498247 0.4871951
## Don't know-Somewhat important actor -0.1364679 -0.9620513 0.68911557 0.9739739
## Don't know-Not an important actor 0.4460793 -0.6779958 1.57015434 0.7358876
Organizations that have been received funding from the US are more likely to consider the US to play an important role in the countries they work in.
funding.table <- df.importance %>%
xtabs(~ Q3.19 + received.funding, .) %>% print
## received.funding
## Q3.19 FALSE TRUE
## Most important actor 81 58
## Somewhat important actor 147 33
## Not an important actor 65 3
## Don't know 131 2
There’s an overall significant difference (though two of the cells are really small here)
(funding.chi <- chisq.test(funding.table))
##
## Pearson's Chi-squared test
##
## data: funding.table
## X-squared = 84.306, df = 3, p-value < 2.2e-16
# Cramer's V for standardized measure of association
assocstats(funding.table)
## X^2 df P(> X^2)
## Likelihood Ratio 91.724 3 0
## Pearson 84.306 3 0
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.374
## Cramer's V : 0.403
# Components of chi-squared
(components <- funding.chi$residuals^2)
## received.funding
## Q3.19 FALSE TRUE
## Most important actor 9.227019e+00 4.075267e+01
## Somewhat important actor 3.628447e-04 1.602564e-03
## Not an important actor 1.646209e+00 7.270758e+00
## Don't know 4.690586e+00 2.071675e+01
round(1-pchisq(components, funding.chi$parameter), 3)
## received.funding
## Q3.19 FALSE TRUE
## Most important actor 0.026 0.000
## Somewhat important actor 1.000 1.000
## Not an important actor 0.649 0.064
## Don't know 0.196 0.000
# Visualize differences
mosaic(funding.table,
labeling_args=list(set_varnames=c(received.funding="Received US funding",
Q3.19="Opinion of US"),
gp_labels=(gpar(fontsize=8))),
gp_varnames=gpar(fontsize=10, fontface=2))
Opinions of importance are strongly associated with US TIP funding given to a country. Organizations are more likely to think the US is an important actor if they work in countries receiving more anti-TIP funding.
ggplot(df.importance, aes(x=Q3.19, y=log.total.funding)) +
geom_violin(fill="grey90") +
geom_point(alpha=0.05) +
geom_point(stat="summary", fun.y="mean", size=5) +
labs(x="Opinion of US", y="Total TIP funding to country (logged)") +
scale_y_continuous(labels=trans_format("exp", dollar_format())) +
coord_flip() + theme_clean()
Variance is not equal in all groups:
kruskal.test(log.total.funding ~ importance_factor, data=df.importance)
##
## Kruskal-Wallis rank sum test
##
## data: log.total.funding by importance_factor
## Kruskal-Wallis chi-squared = 24.023, df = 3, p-value = 2.471e-05
Ratio is 2.8ish, which is below 4, so heterogenous variance is okayish:
df.importance %>% group_by(importance_factor) %>%
summarise(variance = var(log.total.funding, na.rm=TRUE)) %>%
do(data_frame(ratio = max(.$variance) / min(.$variance)))
## Source: local data frame [1 x 1]
##
## ratio
## (dbl)
## 1 2.853239
ANOVA shows significant differences:
funding.anova <- aov(log.total.funding ~ importance_factor, data=df.importance)
summary(funding.anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## importance_factor 3 1318 439.4 12.29 9.44e-08 ***
## Residuals 472 16881 35.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 44 observations deleted due to missingness
(funding.pairs <- TukeyHSD(funding.anova))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = log.total.funding ~ importance_factor, data = df.importance)
##
## $importance_factor
## diff lwr upr p adj
## Somewhat important actor-Most important actor -1.230395 -3.059836 0.5990475 0.3071960
## Not an important actor-Most important actor -4.689629 -7.085082 -2.2941758 0.0000038
## Don't know-Most important actor -3.520775 -5.507176 -1.5343735 0.0000369
## Not an important actor-Somewhat important actor -3.459235 -5.729837 -1.1886317 0.0005690
## Don't know-Somewhat important actor -2.290380 -4.124291 -0.4564688 0.0074669
## Don't know-Not an important actor 1.168854 -1.230014 3.5677226 0.5913210
See those differences
funding.pairs.plot <- data.frame(funding.pairs$importance_factor) %>%
mutate(pair = row.names(.),
pair = factor(pair, levels=pair, ordered=TRUE),
stars = add.stars(p.adj))
fig.funding.pairs <- ggplot(funding.pairs.plot,
aes(x=pair, y=diff, ymax=upr, ymin=lwr)) +
geom_hline(yintercept=0) +
geom_text(aes(label=stars), nudge_x=0.25) +
geom_pointrange() +
theme_clean() + coord_flip()
fig.funding.pairs
Organizations that have been involved with the US are more likely to consider the US to play an important role in the countries they work in.
involvement.table <- df.importance %>%
xtabs(~ Q3.19 + us.involvement, .) %>% print
## us.involvement
## Q3.19 FALSE TRUE
## Most important actor 23 116
## Somewhat important actor 48 132
## Not an important actor 38 30
## Don't know 73 60
There’s an overall significant difference
(involvement.chi <- chisq.test(involvement.table))
##
## Pearson's Chi-squared test
##
## data: involvement.table
## X-squared = 62.456, df = 3, p-value = 1.755e-13
# Cramer's V for standardized measure of association
assocstats(involvement.table)
## X^2 df P(> X^2)
## Likelihood Ratio 63.428 3 1.0880e-13
## Pearson 62.456 3 1.7553e-13
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.327
## Cramer's V : 0.347
# Components of chi-squared
(components <- involvement.chi$residuals^2)
## us.involvement
## Q3.19 FALSE TRUE
## Most important actor 13.523587 7.281931
## Somewhat important actor 3.571429 1.923077
## Not an important actor 8.472269 4.561991
## Don't know 15.029055 8.092568
1-pchisq(components, involvement.chi$parameter)
## us.involvement
## Q3.19 FALSE TRUE
## Most important actor 0.003630870 0.063434414
## Somewhat important actor 0.311615552 0.588524846
## Not an important actor 0.037196022 0.206827040
## Don't know 0.001791987 0.044136847
# Visualize differences
mosaic(involvement.table,
labeling_args=list(set_varnames=c(us.involvement="US involvement",
Q3.19="Opinion of US"),
gp_labels=(gpar(fontsize=8))),
gp_varnames=gpar(fontsize=10, fontface=2))
Does opinion of the US vary by:
ggplot(responses.full, aes(x=Q3.25_collapsed, y=total.funding, fill=Q3.25_collapsed)) +
geom_violin() +
geom_point(alpha=0.3, position=position_jitterdodge()) +
labs(x="Opinion of US", y="Total TIP funding to country") +
scale_y_continuous(labels=dollar)
ggplot(responses.full, aes(x=Q3.25, y=avg.funding, fill=Q3.25)) +
geom_violin() +
geom_point(alpha=0.3, position=position_jitterdodge()) +
labs(x="Opinion of US", y="Average TIP funding to country") +
scale_y_continuous(labels=dollar)
The same is true if looking just at US funding designated for just NGOs and NPOs
ggplot(responses.full, aes(x=Q3.25_collapsed, y=total.funding.ngos, fill=Q3.25_collapsed)) +
geom_violin() +
geom_point(alpha=0.3, position=position_jitterdodge()) +
labs(x="Opinion of US", y="Total NGO-only TIP funding to country") +
scale_y_continuous(labels=dollar)
ggplot(responses.full, aes(x=Q3.25_collapsed, y=avg.funding.ngos, fill=Q3.25_collapsed)) +
geom_violin() +
geom_point(alpha=0.3, position=position_jitterdodge()) +
labs(x="Opinion of US", y="Average NGO-only TIP funding to country") +
scale_y_continuous(labels=dollar)
plot.data <- responses.full %>% select(Q3.19, total.funding) %>% filter(!is.na(Q3.19))
ggplot(plot.data, aes(x=Q3.19, y=total.funding, fill=Q3.19)) +
geom_violin() +
geom_point(alpha=0.3, position=position_jitterdodge()) +
labs(x="US importance", y="Total TIP funding to country") +
scale_y_continuous(labels=dollar) +
coord_flip()
# Type of work
# TODO: work.country is not the most reliable identifier---there be NAs
# Q2.2_X
asdf <- responses.full %>%
select(survey.id, matches("Q2.2_\\d$")) %>%
gather(type_of_work, value, -survey.id) %>%
mutate(type_of_work = factor(type_of_work, levels=paste0("Q2.2_", seq(1:4)),
labels=c("Organs", "Sex", "Labor", "Other"),
ordered=TRUE))
asdf %>%
group_by(type_of_work) %>%
summarise(bloop = n(),
derp = sum(value, na.rm=TRUE),
asdf = derp / n())
## Source: local data frame [4 x 4]
##
## type_of_work bloop derp asdf
## (fctr) (int) (int) (dbl)
## 1 Organs 603 44 0.07296849
## 2 Sex 603 522 0.86567164
## 3 Labor 603 363 0.60199005
## 4 Other 603 146 0.24212272
# Should be 30, 408, 294, 116 with 479 total
# Q2.3_
# Q2.4_X
# Opinion of the US vs. importance
# Can't do this because 3.25 is censored by 3.19
responses.full %>%
xtabs(~ Q3.25 + Q3.19, .)
## Q3.19
## Q3.25 Most important actor Somewhat important actor Not an important actor Don't know
## Don't know 8 28 0 0
## Mixed 21 42 0 0
## Negative 2 0 0 0
## Positive 107 106 0 0
table(responses.full$Q3.25)
##
## Don't know Mixed Negative Positive
## 36 63 2 213
table(responses.full$Q3.19)
##
## Most important actor Somewhat important actor Not an important actor Don't know
## 139 180 68 133
sum(table(responses.full$Q3.25))
## [1] 314
sum(table(responses.full$Q3.19))
## [1] 520
responses.full$Q3.19 %>%
table %>% print %>% prop.table
## .
## Most important actor Somewhat important actor Not an important actor Don't know
## 139 180 68 133
## .
## Most important actor Somewhat important actor Not an important actor Don't know
## 0.2673077 0.3461538 0.1307692 0.2557692
# Find country averages of government improvement, etc. - then show that X number of countries show improvement, etc.
# Report by organization and by country - how many countries has the US had a positive influence + how many NGOs say the US has had a positive influence
full <- left_join(country.indexes, tip.change,
by=c("work.country" = "countryname")) %>%
filter(num.responses >= 10) %>%
mutate(country_label = ifelse(num.responses >= 10, work.country, ""))
ggplot(full, aes(x=improvement_score, y=change_policy, label=work.country)) +
geom_point() + geom_text(vjust=1.5) +
geom_hline(yintercept=0) +
scale_x_continuous(limits=c(0, 1)) +
scale_y_continuous(limits=c(-2, 6))
ggplot(country.indexes, aes(x=work.country, y=improvement_score)) +
geom_bar(stat="identity") +
coord_flip()
# Compare improvement scores with actual changes in TIP score to get a sense of if NGO experiences reflect changes in rankings
ggplot(country.indexes, aes(x=work.country, y=positivity_score)) +
geom_bar(stat="identity") +
coord_flip()
responses.countries %>%
xtabs(~ Q3.25 + Q3.19, .)
## Q3.19
## Q3.25 Most important actor Somewhat important actor Not an important actor Don't know
## Negative 2 0 0 0
## Positive 107 106 0 0
## Mixed 21 42 0 0
## Don't know 8 28 0 0
# Importance opinions
importance.opinions <- responses.all %>%
filter(Q3.19 == "Not an important actor") %>%
select(survey.id, clean.id, Q3.19, contains("TEXT"), Q4.1)
responses.all$Q3.19 %>%
table %>% print %>% prop.table
## .
## Most important actor Somewhat important actor Not an important actor Don't know
## 139 180 68 133
## .
## Most important actor Somewhat important actor Not an important actor Don't know
## 0.2673077 0.3461538 0.1307692 0.2557692
responses.countries %>%
xtabs(~ Q3.25 + Q3.26, .)
## Q3.26
## Q3.25 Improved Remained constant Slowed down Don't know
## Negative 2 0 0 0
## Positive 149 39 22 3
## Mixed 32 16 13 2
## Don't know 19 8 6 3
ggplot(responses.orgs, aes(x = Q1.5.factor)) + geom_bar() +
labs(x = Hmisc::label(responses.orgs$Q1.5))
# Importance of US
asdf <- responses.all %>%
select(clean.id, Q1.2, Q3.8, Q3.6, Q3.7)
inconsistent.no <- c(1020, 1152, 1226, 1267, 1323, 1405, 1515)
inconsistent.dont.know <- c(1051, 1512)
qwer <- asdf %>%
mutate(us.active = ifelse(clean.id %in% c(inconsistent.no, inconsistent.dont.know),
"Yes", as.character(Q3.8)))
qwer$us.active %>% table %>% print %>% prop.table
## .
## Don't know No Yes
## 147 32 352
## .
## Don't know No Yes
## 0.27683616 0.06026365 0.66290019
sdfg <- qwer %>% group_by(clean.id) %>%
summarize(said.no = ifelse(any(us.active == "No", na.rm=TRUE), TRUE, FALSE))
sdfg$said.no %>% table %>% print %>% prop.table
## .
## FALSE TRUE
## 491 31
## .
## FALSE TRUE
## 0.94061303 0.05938697
# US importance and positivity
# Importance of report
responses.orgs$Q2.5 %>% table %>% print %>% prop.table
## .
## No Yes
## 67 452
## .
## No Yes
## 0.1290944 0.8709056
responses.countries$Q3.23 %>% table %>% print %>% prop.table
## .
## No Yes
## 265 203
## .
## No Yes
## 0.5662393 0.4337607
heard.of.tip <- responses.countries %>%
left_join(responses.orgs, by="survey.id") %>%
filter(Q2.5 == "Yes") %>%
group_by(survey.id) %>%
mutate(know.score = ifelse(Q3.22 == "Don't know", FALSE, TRUE)) %>%
select(know.score) %>% unique
heard.of.tip$know.score %>% table %>% print %>% prop.table
## .
## FALSE TRUE
## 98 309
## .
## FALSE TRUE
## 0.2407862 0.7592138
# Opinions of report
opinions <- responses.all %>%
select(clean.id, Q1.2, home.country, work.country, Q3.21_1, Q3.21_4_TEXT, Q3.24.Text)
not.used.tip.ids <- c(1094, 1099, 1106, 1114, 1157, 1221, 1244, 1269,
1314, 1330, 1354, 1357, 1393, 1425)
not.used.tip <- responses.all %>%
mutate(no.response = ifelse(is.na(Q3.21_1) & is.na(Q3.21_2) &
is.na(Q3.21_3) & is.na(Q3.21_4), TRUE, FALSE),
explicit.no = ifelse(clean.id %in% not.used.tip.ids, TRUE, FALSE)) %>%
select(clean.id, Q1.2, Q3.21_1, Q3.21_2, Q3.21_3, Q3.21_4, no.response, explicit.no) %>%
group_by(clean.id) %>%
summarize(no.response = ifelse(sum(no.response) > 0, TRUE, FALSE),
explicit.no = ifelse(sum(explicit.no) > 0, TRUE, FALSE))
not.used.tip$explicit.no %>% table
## .
## FALSE TRUE
## 508 14
# Does opinion of the US vary by:
# * Tier rating (average) or improvement in Cho score?
# * Whether an NGO has received US funding (or where the COUNTRY has received more TIP grants?)
# * Whether an NGO has interacted with the US
# * Whether a country is rich or poor (or some other quality)
# * Whether an NGO focuses on certain types of work?
# * In which countries does the US seem to have had more collaboration with NGOs?